Primary Models
TODO: consider including a plot of error vs. sample size to justify not imputing, while noting that imputation was explored.
K-Nearest Neighbours
TODO: acknowledge a limitation of the CV scheme — features were chosen before cross-validation — and explain why this should not materially bias the error estimates.
#load dependencies
# library() errors immediately on a missing package, unlike require(),
# which only returns FALSE and lets the script fail later
library(png)
library(plyr)     # load before dplyr so dplyr's verbs take precedence
library(dplyr)
library(osmdata)
library(Hmisc)    # capitalize()
library(usedist)
library(leaflet)
library(sf)
library(geosphere) # distm()
library(doParallel)
library(foreach)
library(plotly)
library(knitr)
library(gplots)
# ggplot2 is used directly below but was never attached explicitly;
# don't rely on Hmisc's dependency chain to provide it
library(ggplot2)
The data have been downloaded from the online property website domain.com.au by a member of the online data science community kaggle.com; you can see the post here. Thank you to user anthonypino.
#----read in data
property_data <- read.csv('original_data/Melbourne_housing_FULL.csv')
#----rename columns
names(property_data) <- c('suburb','add','nrooms','type','price','method','agent','date',
                          'precomputeddist','pc','nbedroom','nbathroom','ncar','land_area',
                          'building_area','year_built','council_area','lat','lng','region',
                          'propcount')
#----recast types
# dates arrive as 'dd/mm/YYYY' strings
property_data$date <- as.Date(property_data$date, format = '%d/%m/%Y')
## Warning in strptime(x, format, tz = "GMT"): unable to identify current timezone 'C':
## please set environment variable 'TZ'
# normalise suburb capitalisation so it matches the government data merged later
property_data$suburb <- property_data$suburb %>%
  as.character %>% capitalize %>% as.factor
property_data$add <- as.character(property_data$add)
property_data$propcount <- as.numeric(property_data$propcount)
property_data$precomputeddist <- as.numeric(property_data$precomputeddist)
#----compute distance from city
MELBOURNE_CENTRE <- c(144.9631, -37.8136)  # (lng, lat) of the Melbourne CBD
locs <- select(property_data, c(lng, lat))
# distm() accepts a whole matrix of points, so one vectorised call replaces the
# row-wise apply() loop (same NA-propagating result, far faster)
property_data$dist_cbd <- as.numeric(distm(locs, MELBOURNE_CENTRE))
#give every object an ID
property_data$ID <- as.character(seq_len(nrow(property_data)))
After parsing, and with an ID column appended, the table has 34857 records of sales, each with 23 features. View the first 100 rows of the data here.
TODO: discuss the data in its raw form.
TODO: add a link so readers can download the data themselves.
#parse government data
gov_median_data <- read.csv('other_data/suburb_house_2017_edited.csv', stringsAsFactors = FALSE)
names(gov_median_data)[1] <- 'suburb'
# normalise suburb names to Titlecase so they merge cleanly with property_data$suburb
gov_median_data$suburb <- gov_median_data$suburb %>% tolower %>% capitalize
#limit property_data to 2017
property_data_2017 <- subset(property_data, date < '2018-01-01' & date > '2016-12-31')
#limit to houses
property_data_2017 <- property_data_2017[property_data_2017$type == 'h', ]
#select relevant columns
gov_median_data <- gov_median_data %>% select(c(suburb, X2017))
#rename column
names(gov_median_data)[2] <- 'median_price'
# per-suburb median sale price and sale count from our dataset
property_data_2017 <- group_by(property_data_2017, suburb) %>%
  summarise(median_price = median(price, na.rm = TRUE), count = n())
#record how many properties were sold in this area, compared to total
property_data_2017$prop <- property_data_2017$count / sum(property_data_2017$count)
# '_pd' = our property data, '_gov' = government figures
comparison_data <- merge(property_data_2017, gov_median_data, by = 'suburb', suffixes = c('_pd', '_gov'))
# scatter of government-reported vs. in-dataset median prices, with a y = x
# reference line; hover text shows the suburb and its sale count
plot_ly(data = comparison_data,
        x = ~median_price_gov,
        y = ~median_price_pd,
        type = 'scatter',
        mode = 'markers',
        text = paste0(comparison_data$suburb, '\n', comparison_data$count, ' sales'),
        hoverinfo = 'text'
) %>% add_lines(x = c(0, 4e+6), y = c(0, 4e+6), inherit = FALSE) %>%
  layout(showlegend = FALSE,
         # x maps the government figure and y maps our dataset's figure;
         # the original titles were swapped/truncated
         xaxis = list('title' = 'Government Median Sale Price'),
         yaxis = list('title' = 'Median Sale Price in our Dataset'))
TODO: skip a lot of this, or collapse it; do not use bold.
# render the pre-generated NA-pattern heatmap image on a fresh plot canvas
na_fig <- readPNG('na_heatmap.png')
plot.new()
rasterImage(na_fig, 0, 0, 1, 1)
#----------------------lnglat
# map every sale location to eyeball the coordinate coverage
leaflet(property_data) %>%
  addTiles %>%
  addCircles(lng = ~lng, lat = ~lat,
             opacity = 0.5,
             fillOpacity = 0.5,
             color = '#0078D7',
             fillColor = '#0078D7')
# very few rows are missing lng/lat, so drop them rather than impute
loc_bool <- !is.na(property_data$lng) & !is.na(property_data$lat)
#----------------------year_built
# one property was apparently built in 1196 and another next century;
# treat any year outside [1788, 2030] as missing
implausible_year <- which(property_data$year_built < 1788 | property_data$year_built > 2030)
property_data[implausible_year, 'year_built'] <- NA
#--------------------ncar
# jittered scatter of car spaces against log price to spot outliers
ggplot(data = property_data, aes(x = ncar, y = log(price))) +
  geom_jitter(alpha = 0.2, color = '#0078D7')
# keep properties with at most 6 car parks; missing ncar is retained
ncar_bool <- is.na(property_data$ncar) | property_data$ncar <= 6
#--------------------nbathroom
ggplot(property_data, aes(x = nbathroom, y = log(price))) +
  geom_jitter(alpha = 0.2, color = '#0078D7')
# keep properties with 1-4 bathrooms; note that, unlike ncar above,
# rows with missing nbathroom are excluded here
bath_bool <- !is.na(property_data$nbathroom) &
  property_data$nbathroom <= 4 & property_data$nbathroom > 0
#---------------------nrooms
ggplot(property_data, aes(x = nrooms, y = log(price))) +
  geom_jitter(alpha = 0.2, color = '#0078D7')
# keep properties with 1-6 rooms
# NOTE(review): no is.na() guard here — presumably nrooms is never missing; confirm
nroom_bool <- property_data$nrooms > 0 & property_data$nrooms <= 6
#---------------------nbedroom
# nbedroom is missing for a large share of the rows...
sum(is.na(property_data$nbedroom))
## [1] 8217
has_bedroom = !is.na(property_data$nbedroom)
# ...and where it is present it is almost perfectly correlated with nrooms,
# so it carries little extra information
cor(property_data$nbedroom[has_bedroom],property_data$nrooms[has_bedroom])
## [1] 0.9467546
#dont include in analysis
#----------------------building_area
ggplot(property_data, aes(x = building_area, y = log(price))) +
  geom_point(alpha = 0.2, color = '#0078D7')
# a building area of zero is meaningless: recode it as NA
zero_or_na <- is.na(property_data$building_area) | property_data$building_area == 0
property_data[zero_or_na, 'building_area'] <- NA
# keep buildings with building_area under 1000; missing values are retained
BA_bool <- is.na(property_data$building_area) | property_data$building_area < 1000
#----------------------land_area
ggplot(property_data, aes(x = land_area, y = log(price))) +
  geom_point(alpha = 0.2, color = '#0078D7')
# keep land areas below 10000; missing land_area is retained
land_bool <- is.na(property_data$land_area) | property_data$land_area < 10000
#---------------------actions
# keep only the rows that pass every filter above (intersection of all masks)
keep <- nroom_bool & bath_bool & land_bool & BA_bool & ncar_bool & loc_bool
property_data <- property_data[keep, ]
# replot the two area features to confirm the outliers are gone
ggplot(property_data, aes(x = building_area, y = log(price))) +
  geom_point(alpha = 0.2, color = '#0078D7')
ggplot(property_data, aes(x = land_area, y = log(price))) +
  geom_point(alpha = 0.2, color = '#0078D7')
# persist the cleaned table for downstream scripts
save(property_data, file = 'property_data_from_Rmd_file.Robject')
The validation scheme is as follows:
This scheme has two main benefits over other popular methods:
It is fair: every model is evaluated only on data that played no part in fitting it.
It uses all of the training/ensemble-validation data for the final model: once validation is complete, the final model is refit on the full training set, so no data is wasted.
Validation Scheme Diagram
# proportion of NA values per feature, tabulated for the features that have any
# vapply() (unlike sapply()) guarantees a numeric vector even on degenerate input
na_prop <- vapply(property_data, function(x) mean(is.na(x)), numeric(1))
na_prop_df <- data.frame('feature' = names(na_prop), 'prop_na' = na_prop)
# keep only features with at least one NA, most-missing first
na_prop_df <- na_prop_df[na_prop_df$prop_na > 0, ]
na_prop_df <- na_prop_df[order(na_prop_df$prop_na, decreasing = TRUE), ]
na_prop_df$prop_na <- round(na_prop_df$prop_na, 2)
na_prop_df %>% kable(row.names = FALSE,
                     col.names = c('Feature', 'Proportion of Objects with NA Values'))
| Feature | Proportion of Objects with NA Values |
|---|---|
| building_area | 0.49 |
| year_built | 0.42 |
| price | 0.22 |
| land_area | 0.14 |
| ncar | 0.02 |
TODO: note that reconciliation happens after validation.
TODO: include some plots with the red colouring.
TODO: keep this section brief.
TODO: consider including a plot of error vs. sample size to justify not imputing, while noting that imputation was explored.
TODO: acknowledge a limitation of the CV scheme — features were chosen before cross-validation — and explain why this should not materially bias the error estimates.